import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pypandoc
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
from sklearn import linear_model
import matplotlib.pyplot as plt
from matplotlib import rcParams
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.nonparametric.smoothers_lowess import lowess
import numpy as np
import scipy.stats as stats
from sklearn.preprocessing import PolynomialFeatures
# Load the Auto data set and fit a multiple linear regression of mpg on all
# numeric predictors (everything except the 'name' string column).
auto = pd.read_csv('Auto.csv')

# Scatterplot matrix of every variable.
fig = px.scatter_matrix(auto)
fig.show()

# 'horsepower' is read as text because of '?' placeholders; coerce to numeric
# first and drop the rows that fail to parse, so the correlation matrix below
# can include it (pandas >= 2.0 raises on non-numeric columns in .corr()).
auto['horsepower'] = pd.to_numeric(auto['horsepower'], errors='coerce')
auto = auto.dropna(axis=0, how='any')

# Use a list, not a set, for column selection: pandas rejects set indexers
# and a list keeps a deterministic column order.
correlation_cols = [c for c in auto.columns if c != 'name']
corr = auto[correlation_cols].corr()
corr.style.background_gradient(cmap='coolwarm')

# Predictors are all numeric columns except the response, mpg.  Build a new
# list instead of aliasing correlation_cols and mutating it in place.
input_cols = [c for c in correlation_cols if c != 'mpg']
auto.shape
input_cols

# Fit OLS with an explicit intercept column.
X = sm.add_constant(auto[input_cols])
y = auto[['mpg']]
model = sm.OLS(y, X).fit()
predictions = model.predict(X)
model.summary()
Yes, there is a relationship between the predictors and the response, based on the R-squared value of 0.821, which shows that about 82% of the variation in mpg can be explained by the predictors in the model. Some predictors, however, are not statistically significant, based on their p-values.
Displacement, weight, origin, and year appear to have a statistically significant relationship to the response, based on their p-values.
With all other predictors held constant, mpg increases by 0.75 for every one-year increase in model year.
# Diagnostic plot: residuals vs. fitted values, with a lowess trend line
# to reveal any systematic (non-linear) structure left in the residuals.
residuals = model.resid
fitted = model.fittedvalues
smoothed = lowess(residuals, fitted)

fig, ax = plt.subplots()
ax.scatter(fitted, residuals, facecolors='none', edgecolors='k')
ax.plot(smoothed[:, 0], smoothed[:, 1], color='r')
ax.set(xlabel='Fitted Values', ylabel='Residuals',
       title='Residuals vs. Fitted')
# Dotted zero line spanning the fitted-value range, for reference.
ax.plot([min(fitted), max(fitted)], [0, 0],
        color='k', linestyle=':', alpha=.3)
This plot suggests a non-linear relationship between the response and the predictors.
# Diagnostic plot: Normal Q-Q of the internally studentized residuals.
# Points near the diagonal indicate approximately normal residuals.
sorted_student_residuals = pd.Series(
    model.get_influence().resid_studentized_internal,
    index=model.resid.index,
).sort_values(ascending=True)
df = sorted_student_residuals.to_frame('sorted_student_residuals')
df['theoretical_quantiles'] = stats.probplot(
    df['sorted_student_residuals'], dist='norm', fit=False)[0]
# Residual magnitudes ranked largest-first (handy for spotting outliers).
rankings = abs(df['sorted_student_residuals']).sort_values(ascending=False)

fig, ax = plt.subplots()
x = df['theoretical_quantiles']
y = df['sorted_student_residuals']
ax.scatter(x, y, edgecolor='k', facecolor='none')
ax.set(title='Normal Q-Q', ylabel='Standardized Residuals',
       xlabel='Theoretical Quantiles')
# Reference diagonal spanning the joint range of both axes.
lo, hi = np.min([x, y]), np.max([x, y])
ax.plot([lo, hi], [lo, hi], color='r', ls='--')
This plot shows that the residuals are mostly normally distributed, but slightly right-skewed.
# Diagnostic plot: Scale-Location — sqrt(|studentized residuals|) against
# fitted values; a flat lowess trend indicates homoscedastic errors.
student_residuals = model.get_influence().resid_studentized_internal
sqrt_student_residuals = pd.Series(np.sqrt(np.abs(student_residuals)))
sqrt_student_residuals.index = model.resid.index
smoothed = lowess(sqrt_student_residuals, fitted)
fig, ax = plt.subplots()
ax.scatter(fitted, sqrt_student_residuals, edgecolors = 'k', facecolors = 'none')
ax.plot(smoothed[:,0],smoothed[:,1],color = 'r')
# Raw string: '\s' and '\ ' are invalid escape sequences in a normal literal
# (SyntaxWarning on Python 3.12+); the LaTeX needs literal backslashes.
ax.set_ylabel(r'$\sqrt{|Studentized \ Residuals|}$')
ax.set_xlabel('Fitted Values')
ax.set_title('Scale-Location')
ax.set_ylim(0, max(sqrt_student_residuals) + 0.1)
This plot shows that the residuals are spread fairly randomly across the range of fitted values.
# Diagnostic plot: studentized residuals vs. leverage, with Cook's-distance
# contours at D = 0.5 and D = 1.0.  NOTE: this code is order-sensitive — the
# axis limits are fixed with set_xlim/set_ylim BEFORE the dotted reference
# lines read them back via get_xlim/get_ylim.
student_residuals = pd.Series(model.get_influence().resid_studentized_internal)
student_residuals.index = model.resid.index
df = pd.DataFrame(student_residuals)
df.columns = ['student_residuals']
# Diagonal of the hat matrix = leverage of each observation.
df['leverage'] = model.get_influence().hat_matrix_diag
smoothed = lowess(df['student_residuals'],df['leverage'])
# Residual magnitudes ranked largest-first (for identifying extreme points).
sorted_student_residuals = abs(df['student_residuals']).sort_values(ascending = False)
fig, ax = plt.subplots()
x = df['leverage']
y = df['student_residuals']
# x position just past the largest leverage, used to place the contour labels.
xpos = max(x)+max(x)*0.01
ax.scatter(x, y, edgecolors = 'k', facecolors = 'none')
ax.plot(smoothed[:,0],smoothed[:,1],color = 'r')
ax.set_ylabel('Studentized Residuals')
ax.set_xlabel('Leverage')
ax.set_title('Residuals vs. Leverage')
ax.set_ylim(min(y)-min(y)*0.15,max(y)+max(y)*0.15)
ax.set_xlim(-0.01,max(x)+max(x)*0.05)
plt.tight_layout()
# Cook's distance contours: solving D = (r^2/p) * h/(1-h) for the residual r
# at fixed D gives r = +/- sqrt(D * p * (1-h)/h) over the leverage range h.
cooksx = np.linspace(min(x), xpos, 50)
p = len(model.params)
poscooks1y = np.sqrt((p*(1-cooksx))/cooksx)
poscooks05y = np.sqrt(0.5*(p*(1-cooksx))/cooksx)
negcooks1y = -np.sqrt((p*(1-cooksx))/cooksx)
negcooks05y = -np.sqrt(0.5*(p*(1-cooksx))/cooksx)
ax.plot(cooksx,poscooks1y,label = "Cook's Distance", ls = ':', color = 'r')
ax.plot(cooksx,poscooks05y, ls = ':', color = 'r')
ax.plot(cooksx,negcooks1y, ls = ':', color = 'r')
ax.plot(cooksx,negcooks05y, ls = ':', color = 'r')
# Dotted axes through the origin, clipped to the current view limits.
ax.plot([0,0],ax.get_ylim(), ls=":", alpha = .3, color = 'k')
ax.plot(ax.get_xlim(), [0,0], ls=":", alpha = .3, color = 'k')
# Label each Cook's-distance band at its right-hand end.
ax.annotate('1.0', xy = (xpos, poscooks1y[-1]), color = 'r')
ax.annotate('0.5', xy = (xpos, poscooks05y[-1]), color = 'r')
ax.annotate('1.0', xy = (xpos, negcooks1y[-1]), color = 'r')
ax.annotate('0.5', xy = (xpos, negcooks05y[-1]), color = 'r')
ax.legend()
plt.show()
This plot shows a potential high-leverage point at leverage > 0.175, which is well outside the leverage values of the other points.
The residual plots do not show many unusually large outliers, but the leverage plot does identify one observation with unusually large leverage.
# Refit with all pairwise interaction terms.  Derive the interaction column
# names from X's actual column order instead of hard-coding 28 labels: the
# order of input_cols is not guaranteed, and PolynomialFeatures(degree=2,
# interaction_only=True, include_bias=False) emits the original features
# first, then products of column pairs in combinations(i, j) order.
from itertools import combinations

X = auto[input_cols]
base_cols = list(X.columns)
interaction_names = base_cols + [
    f'{a}:{b}' for a, b in combinations(base_cols, 2)
]
x_interaction = PolynomialFeatures(
    2, interaction_only=True, include_bias=False
).fit_transform(X)
interaction_df = pd.DataFrame(x_interaction, index=auto.index,
                              columns=interaction_names)
y = auto[['mpg']]
interaction_df = sm.add_constant(interaction_df)
model = sm.OLS(y, interaction_df).fit()
predictions = model.predict(interaction_df)
model.summary()
# Show only the terms significant at the 5% level.
model.pvalues[model.pvalues < 0.05]
In the interaction model above, with all of the interactions included, the displacement:weight, displacement:origin, and acceleration:origin interactions all appear to be significant. This shows that although origin does not appear to be significant on its own, it is important in relation to displacement and acceleration, so it should be included in the model.
# Compare mpg against several transformations of displacement to see which
# gives the most linear relationship.
auto.columns
auto['sqrt_displacement'] = auto['displacement'] ** 0.5
auto['log_displacement'] = np.log(auto['displacement'])
auto['displacement_squared'] = auto['displacement'] ** 2

# One scatter plot per transformation (plus the untransformed column).
for col in ('sqrt_displacement', 'log_displacement',
            'displacement_squared', 'displacement'):
    fig = px.scatter(auto, x=col, y='mpg')
    fig.show()
From the plots above for displacement, sqrt(displacement), log(displacement), and displacement^2, it appears that the log and square-root transformations show a more linear relationship with mpg than displacement and displacement^2 do.
# Carseats: regress Sales on Price plus the Urban and US dummy variables.
carseats = pd.read_csv('Carseats.csv')

# Encode the Yes/No factors as 1/0 indicators.
binary_map = {'Yes': 1, 'No': 0}
carseats['Urban'] = carseats['Urban'].map(binary_map)
carseats['US'] = carseats['US'].map(binary_map)

carseat_input_cols = ['Price', 'Urban', 'US']
y = carseats['Sales']
X = sm.add_constant(carseats[carseat_input_cols])
model = sm.OLS(y, X).fit()
predictions = model.predict(X)
model.summary()
When the other two predictors are held constant, and price is increased by $1, Sales decrease by 0.0545 (thousand units). The binary (yes/no) Urban predictor is not statistically significant to Sales. Stores in the US sell about 1200 more units than non-US stores, with all other predictors held constant.
Sales = 13.0435 - 0.0545 * Price - 0.0219 * Urban + 1.2 * US + $\epsilon$, where Urban and US are binary (Yes=1 and No=0).
We can reject the null hypothesis for Price and US, as their p-values are less than 0.05.
# Refit using only the significant predictors: Price and US.
carseat_input_cols = ['Price', 'US']
y = carseats['Sales']
X = sm.add_constant(carseats[carseat_input_cols])
model = sm.OLS(y, X).fit()
predictions = model.predict(X)
model.summary()
Both models have R-squared values of 0.239, meaning they explain only about 24% of the variation in the response.
From the model summary in (e), the 95% confidence intervals are: Price: [-0.065, -0.044]; US: [0.692, 1.708].
# Residuals vs. fitted values for the reduced Carseats model, with a lowess
# trend line to reveal any remaining structure.
residuals = model.resid
fitted = model.fittedvalues
smoothed = lowess(residuals, fitted)

fig, ax = plt.subplots()
ax.scatter(fitted, residuals, facecolors='none', edgecolors='k')
ax.plot(smoothed[:, 0], smoothed[:, 1], color='r')
ax.set(xlabel='Fitted Values', ylabel='Residuals',
       title='Residuals vs. Fitted')
# Dotted zero line spanning the fitted-value range, for reference.
ax.plot([min(fitted), max(fitted)], [0, 0],
        color='k', linestyle=':', alpha=.3)
# Normal Q-Q plot of the studentized residuals for the Carseats model.
sorted_student_residuals = pd.Series(
    model.get_influence().resid_studentized_internal,
    index=model.resid.index,
).sort_values(ascending=True)
df = sorted_student_residuals.to_frame('sorted_student_residuals')
df['theoretical_quantiles'] = stats.probplot(
    df['sorted_student_residuals'], dist='norm', fit=False)[0]
# Residual magnitudes ranked largest-first (handy for spotting outliers).
rankings = abs(df['sorted_student_residuals']).sort_values(ascending=False)

fig, ax = plt.subplots()
x = df['theoretical_quantiles']
y = df['sorted_student_residuals']
ax.scatter(x, y, edgecolor='k', facecolor='none')
ax.set(title='Normal Q-Q', ylabel='Standardized Residuals',
       xlabel='Theoretical Quantiles')
# Reference diagonal spanning the joint range of both axes.
lo, hi = np.min([x, y]), np.max([x, y])
ax.plot([lo, hi], [lo, hi], color='r', ls='--')
# Scale-Location plot for the Carseats model: sqrt(|studentized residuals|)
# against fitted values; a flat trend indicates homoscedastic errors.
student_residuals = model.get_influence().resid_studentized_internal
sqrt_student_residuals = pd.Series(np.sqrt(np.abs(student_residuals)))
sqrt_student_residuals.index = model.resid.index
smoothed = lowess(sqrt_student_residuals, fitted)
fig, ax = plt.subplots()
ax.scatter(fitted, sqrt_student_residuals, edgecolors = 'k', facecolors = 'none')
ax.plot(smoothed[:,0],smoothed[:,1],color = 'r')
# Raw string: '\s' and '\ ' are invalid escape sequences in a normal literal
# (SyntaxWarning on Python 3.12+); the LaTeX needs literal backslashes.
ax.set_ylabel(r'$\sqrt{|Studentized \ Residuals|}$')
ax.set_xlabel('Fitted Values')
ax.set_title('Scale-Location')
ax.set_ylim(0, max(sqrt_student_residuals) + 0.1)
# Studentized residuals vs. leverage for the Carseats model, with
# Cook's-distance contours at D = 0.5 and D = 1.0.  NOTE: order-sensitive —
# set_xlim/set_ylim must run BEFORE the dotted reference lines read the
# limits back via get_xlim/get_ylim.
student_residuals = pd.Series(model.get_influence().resid_studentized_internal)
student_residuals.index = model.resid.index
df = pd.DataFrame(student_residuals)
df.columns = ['student_residuals']
# Diagonal of the hat matrix = leverage of each observation.
df['leverage'] = model.get_influence().hat_matrix_diag
smoothed = lowess(df['student_residuals'],df['leverage'])
# Residual magnitudes ranked largest-first (for identifying extreme points).
sorted_student_residuals = abs(df['student_residuals']).sort_values(ascending = False)
fig, ax = plt.subplots()
x = df['leverage']
y = df['student_residuals']
# x position just past the largest leverage, used to place the contour labels.
xpos = max(x)+max(x)*0.01
ax.scatter(x, y, edgecolors = 'k', facecolors = 'none')
ax.plot(smoothed[:,0],smoothed[:,1],color = 'r')
ax.set_ylabel('Studentized Residuals')
ax.set_xlabel('Leverage')
ax.set_title('Residuals vs. Leverage')
ax.set_ylim(min(y)-min(y)*0.15,max(y)+max(y)*0.15)
ax.set_xlim(-0.01,max(x)+max(x)*0.05)
plt.tight_layout()
# Cook's distance contours: solving D = (r^2/p) * h/(1-h) for the residual r
# at fixed D gives r = +/- sqrt(D * p * (1-h)/h) over the leverage range h.
cooksx = np.linspace(min(x), xpos, 50)
p = len(model.params)
poscooks1y = np.sqrt((p*(1-cooksx))/cooksx)
poscooks05y = np.sqrt(0.5*(p*(1-cooksx))/cooksx)
negcooks1y = -np.sqrt((p*(1-cooksx))/cooksx)
negcooks05y = -np.sqrt(0.5*(p*(1-cooksx))/cooksx)
ax.plot(cooksx,poscooks1y,label = "Cook's Distance", ls = ':', color = 'r')
ax.plot(cooksx,poscooks05y, ls = ':', color = 'r')
ax.plot(cooksx,negcooks1y, ls = ':', color = 'r')
ax.plot(cooksx,negcooks05y, ls = ':', color = 'r')
# Dotted axes through the origin, clipped to the current view limits.
ax.plot([0,0],ax.get_ylim(), ls=":", alpha = .3, color = 'k')
ax.plot(ax.get_xlim(), [0,0], ls=":", alpha = .3, color = 'k')
# Label each Cook's-distance band at its right-hand end.
ax.annotate('1.0', xy = (xpos, poscooks1y[-1]), color = 'r')
ax.annotate('0.5', xy = (xpos, poscooks05y[-1]), color = 'r')
ax.annotate('1.0', xy = (xpos, negcooks1y[-1]), color = 'r')
ax.annotate('0.5', xy = (xpos, negcooks05y[-1]), color = 'r')
ax.legend()
plt.show()
From the Normal Q-Q plot there don't appear to be any outliers, and from the Residuals vs. Leverage plot there don't appear to be any high-leverage points.